# Fix the RNG seed so non-cached computations below are reproducible.
set.seed(12345)
# Loads `data` and helpers (e.g. cachefile()) used throughout this document.
source("ingest_data.R")
# Basename for all cached model fits produced in this document.
MODEL_CACHE <- "added-M_pop_A_team_team_repo_RDC_zi_sd025_gamma01"

Settings

Models are stored in the following directory, which must exist prior to knitting this document:

# Resolve and print the model cache directory; mustWork = TRUE makes knitting
# fail early if the directory does not exist. (file.path replaces the manual
# paste(sep = "/"); TRUE replaces the reassignable shorthand T.)
cat(normalizePath(file.path(getwd(), dirname(cachefile(MODEL_CACHE))), mustWork = TRUE))
## /home/app/.cache

The used cache directory can be controlled via the cache parameter to Rmd - it can be useful to experiment with this parameter if you Knit the document manually in RStudio.

Using a narrower gamma prior

We can experiment with a narrower shape prior by configuring it to (for instance) gamma(0.1, 0.1), and running the full model based on this prior. The conclusions and tendencies should be the same, or similar, but the choice of prior might affect how fast the model converges, and if there are diagnostic problems (e.g. divergent transitions).

# Model data: outcome y = number of introduced duplicates (INTROD); A, C, D, R
# are short predictor names (the prediction helpers later pair them with
# data$ADD, data$COMPLEX, data$DUP, data$DEL -- presumably added lines,
# complexity, duplicates, deleted lines; confirm in ingest_data.R), plus
# team and repository grouping factors.
d <- data |> select(y=INTROD,
                    A=A,
                    C=C,
                    D=D,
                    R=R,
                    team=committerteam,
                    repo=repo)
# Zero-inflated negative binomial: the count part and the zero-inflation (zi)
# part share the same structure, with varying intercepts and R/C/D slopes per
# team and per repository-within-team.
formula <- bf(y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo),
              zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo))
# List all parameters that can take priors (and the brms defaults) for this model.
get_prior(data=d,
          family=zero_inflated_negbinomial,
          formula=formula)
##                    prior     class      coef     group resp dpar nlpar lb ub
##                   (flat)         b                                          
##                   (flat)         b         A                                
##                   (flat)         b         C                                
##                   (flat)         b         D                                
##                   (flat)         b         R                                
##                   lkj(1)       cor                                          
##                   lkj(1)       cor                team                      
##                   lkj(1)       cor           team:repo                      
##  student_t(3, -2.3, 2.5) Intercept                                          
##     student_t(3, 0, 2.5)        sd                                      0   
##     student_t(3, 0, 2.5)        sd                team                  0   
##     student_t(3, 0, 2.5)        sd         C      team                  0   
##     student_t(3, 0, 2.5)        sd         D      team                  0   
##     student_t(3, 0, 2.5)        sd Intercept      team                  0   
##     student_t(3, 0, 2.5)        sd         R      team                  0   
##     student_t(3, 0, 2.5)        sd           team:repo                  0   
##     student_t(3, 0, 2.5)        sd         C team:repo                  0   
##     student_t(3, 0, 2.5)        sd         D team:repo                  0   
##     student_t(3, 0, 2.5)        sd Intercept team:repo                  0   
##     student_t(3, 0, 2.5)        sd         R team:repo                  0   
##        gamma(0.01, 0.01)     shape                                      0   
##                   (flat)         b                            zi            
##                   (flat)         b         A                  zi            
##                   (flat)         b         C                  zi            
##                   (flat)         b         D                  zi            
##                   (flat)         b         R                  zi            
##           logistic(0, 1) Intercept                            zi            
##     student_t(3, 0, 2.5)        sd                            zi        0   
##     student_t(3, 0, 2.5)        sd                team        zi        0   
##     student_t(3, 0, 2.5)        sd         C      team        zi        0   
##     student_t(3, 0, 2.5)        sd         D      team        zi        0   
##     student_t(3, 0, 2.5)        sd Intercept      team        zi        0   
##     student_t(3, 0, 2.5)        sd         R      team        zi        0   
##     student_t(3, 0, 2.5)        sd           team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd         C team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd         D team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd Intercept team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd         R team:repo        zi        0   
##        source
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
# Weakly-informative priors; identical to the M_4 configuration except the
# shape prior, which is changed from the default gamma(0.01, 0.01) to the
# narrower gamma(0.1, 0.1) under evaluation here.
priors <- c(prior(normal(0, 0.5), class = Intercept),
            prior(normal(0, 0.25), class = b),
            prior(weibull(2, 0.25), class = sd),
            prior(weibull(2, 0.25), class = sd, group=team:repo),
            prior(lkj(2), class = cor),
            # Mirror the same priors for the zero-inflation part (dpar = zi).
            prior(normal(0, 0.5), class = Intercept, dpar=zi),
            prior(normal(0, 0.25), class = b, dpar=zi),
            prior(weibull(2, 0.25), class = sd, dpar=zi),
            prior(weibull(2, 0.25), class = sd, group=team:repo, dpar=zi),
            # The narrower shape prior this document experiments with.
            prior(gamma(0.1, 0.1), class = shape))

# Check the user-specified priors against the model structure; the outer
# parentheses force printing so the resolved prior table appears in the output.
(v <- validate_prior(prior = priors,
                     formula = formula,
                     data = d,
                     family = zero_inflated_negbinomial))
##                 prior     class      coef     group resp dpar nlpar lb ub
##       normal(0, 0.25)         b                                          
##       normal(0, 0.25)         b         A                                
##       normal(0, 0.25)         b         C                                
##       normal(0, 0.25)         b         D                                
##       normal(0, 0.25)         b         R                                
##       normal(0, 0.25)         b                            zi            
##       normal(0, 0.25)         b         A                  zi            
##       normal(0, 0.25)         b         C                  zi            
##       normal(0, 0.25)         b         D                  zi            
##       normal(0, 0.25)         b         R                  zi            
##        normal(0, 0.5) Intercept                                          
##        normal(0, 0.5) Intercept                            zi            
##  lkj_corr_cholesky(2)         L                                          
##  lkj_corr_cholesky(2)         L                team                      
##  lkj_corr_cholesky(2)         L           team:repo                      
##      weibull(2, 0.25)        sd                                      0   
##      weibull(2, 0.25)        sd                            zi        0   
##      weibull(2, 0.25)        sd                team                  0   
##      weibull(2, 0.25)        sd         C      team                  0   
##      weibull(2, 0.25)        sd         D      team                  0   
##      weibull(2, 0.25)        sd Intercept      team                  0   
##      weibull(2, 0.25)        sd         R      team                  0   
##      weibull(2, 0.25)        sd                team        zi        0   
##      weibull(2, 0.25)        sd         C      team        zi        0   
##      weibull(2, 0.25)        sd         D      team        zi        0   
##      weibull(2, 0.25)        sd Intercept      team        zi        0   
##      weibull(2, 0.25)        sd         R      team        zi        0   
##      weibull(2, 0.25)        sd           team:repo                  0   
##      weibull(2, 0.25)        sd         C team:repo                  0   
##      weibull(2, 0.25)        sd         D team:repo                  0   
##      weibull(2, 0.25)        sd Intercept team:repo                  0   
##      weibull(2, 0.25)        sd         R team:repo                  0   
##      weibull(2, 0.25)        sd           team:repo        zi        0   
##      weibull(2, 0.25)        sd         C team:repo        zi        0   
##      weibull(2, 0.25)        sd         D team:repo        zi        0   
##      weibull(2, 0.25)        sd Intercept team:repo        zi        0   
##      weibull(2, 0.25)        sd         R team:repo        zi        0   
##       gamma(0.1, 0.1)     shape                                      0   
##        source
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##          user
##          user
##  (vectorized)
##  (vectorized)
##          user
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user

The same structure as in \(\mathcal{M}_4\), but a single prior is changed to gamma(0.1, 0.1).

Prior Predictive Checks

# Prior-predictive fit: sample_prior = "only" ignores the likelihood, so the
# draws reflect the priors alone. The fit is cached via `file` and only refit
# when the model changes (file_refit = "on_change"). ITERATIONS, CHAINS, CORES,
# THREADS, SAVE_PARS and ADAPT_DELTA are knit-time settings -- presumably
# defined in ingest_data.R; confirm there.
M_ppc <- brm(data = d,
      family = zero_inflated_negbinomial,
      formula = formula,
      prior = priors,
      file = cachefile(paste0("ppc-",MODEL_CACHE)),
      warmup = 1000,
      iter  = ITERATIONS,
      chains = CHAINS,
      cores = CORES,
      sample_prior = "only",
      backend="cmdstanr",
      file_refit = "on_change",
      threads = threading(THREADS),
      save_pars = SAVE_PARS,
      adapt_delta = ADAPT_DELTA)
m <- M_ppc
# Prior-predictive draws for the observed design matrix.
yrep <- posterior_predict(m, newdata = d) # or nd

Number of zeros

# Prior-predictive share of zero-duplicate observations vs. the observed share.
ppc_stat(y = d$y, yrep, stat = \(y) mean(y == 0)) +
  ggtitle("Prior predicted proportion of zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We see that the changed shape prior causes the model to be more skeptical of seeing duplicates—the most likely proportion of zeros in predicted data is now 1.0. But we also see that the model does allow proportions of zeros between about 0.5 and 0.95, so the model does not rule out that duplicates appear.

Max predicted value

# Prior-predictive distribution of the sample maximum; kept for re-scaling below.
(sim_max <- ppc_stat(y = d$y, yrep, stat = "max") + ggtitle("Prior predicted max values")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scaling to more reasonable values

# Zoom the x-axis to 0-1000; bins beyond the limit are dropped from the plot.
sim_max + scale_x_continuous(limits = c(0,1000)) + ggtitle("Prior predicted max values up to 1000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The maximum predicted value is more spread out, compared to \(\mathcal{M}_4\) (lower peak at 0, slightly higher distribution at 1000), but the overall distribution shape looks similar.

99th percentile

# Prior-predictive 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Prior predicted Q99 vs. observed value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

At the 99th percentile, we again see that this model is more skeptical of seeing duplicates. The most likely value includes zero.

99th vs 95th percentile

# Joint (2-d) prior-predictive distribution of Q95 and Q99.
p <- ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + theme(legend.position="bottom") #+ ggtitle("Prior predicted Q95 vs Q99")
p

The 2-d plot confirms the conclusion—95th percentile now ranges to about 20, whereas \(\mathcal{M}_4\) ranged to about 30.

Standard deviation

The standard deviation of the predictions is a bit harder to grasp intuitively. But our prior information encompass the observed value well.

# Prior-predictive standard deviation; kept in `p` for the zoomed view below.
(p <- ppc_stat(y = d$y, yrep, stat = "sd") + ggtitle("Prior predicted stddev vs. observed value")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Zooming in to show the distribution in more detail.

# Zoomed view of the stddev distribution (x restricted to 0-30).
p + scale_x_continuous(limits=c(0,30))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Lower peak at zero, thicker tail around 20–30.

Group-level predictions

# Prior-predictive Q99, broken down per team and per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per repository")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Model execution and diagnostics

# Full model fit (priors + likelihood), cached under MODEL_CACHE; same sampler
# settings as the prior-predictive fit above.
M_pop_A_team_team_repo_zi_sd025_model <-
  brm(data = d,
      family = zero_inflated_negbinomial,
      file = cachefile(MODEL_CACHE),
      formula = formula,
      prior = priors,
      warmup = 1000,
      iter  = ITERATIONS,
      chains = CHAINS,
      cores = CORES,
      backend="cmdstanr",
      file_refit = "on_change",
      threads = threading(THREADS),
      save_pars = SAVE_PARS,
      adapt_delta = ADAPT_DELTA)
# Attach the PSIS-LOO criterion to the (cached) fit for later comparison.
M_pop_A_team_team_repo_zi_sd025_model <- add_criterion(M_pop_A_team_team_repo_zi_sd025_model, "loo")
m <- M_pop_A_team_team_repo_zi_sd025_model
# Trace plots, 12 parameters per page: draw one full trace plot only to
# harvest the parameter names, then re-plot in chunks of 12.
p <- mcmc_trace(m)
pars <- levels(p[["data"]][["parameter"]])
plots <- seq(1, to=length(pars), by=12)  # NOTE(review): errors if pars is empty
lapply(plots, function(i) {
  start <- i
  end <- start+11
  # na.omit() drops out-of-range names in the final, partial chunk.
  mcmc_trace(m, pars = na.omit(pars[start:end]))
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

## 
## [[65]]

## 
## [[66]]

## 
## [[67]]

## 
## [[68]]

The “caterpillar plots” show that the chains mixed well also for this model.

# Convergence diagnostics: R-hat per parameter and as a histogram.
mcmc_plot(m, type="rhat")

mcmc_plot(m, type="rhat_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Effective sample size ratios, per parameter and as a histogram.
mcmc_plot(m, type="neff")

mcmc_plot(m, type="neff_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Just as for \(\mathcal{M}_4\), the MCMC chains, \(\hat{R}\) and \(N_{eff}\) ratio look good.

# PSIS-LOO estimate for the fitted model.
# NOTE(review): the result shadows loo::loo() in the global environment;
# later code uses the namespaced loo::pareto_k_ids(), so this is benign here.
loo <- loo(m)
loo
## 
## Computed from 12000 by 31007 log-likelihood matrix.
## 
##          Estimate    SE
## elpd_loo  -8486.3 163.1
## p_loo       258.6  16.5
## looic     16972.5 326.2
## ------
## MCSE of elpd_loo is NA.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.1, 1.6]).
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. ESS
## (-Inf, 0.7]   (good)     30994 100.0%  674     
##    (0.7, 1]   (bad)         10   0.0%  <NA>    
##    (1, Inf)   (very bad)     3   0.0%  <NA>    
## See help('pareto-k-diagnostic') for details.
# Pareto-k diagnostic plot; the 13 points with k > 0.7 motivate reloo below.
plot(loo)

There are 13 outlier points (10 with Pareto k over 0.7 and 3 over 1.0) that need to be analysed, e.g. with reloo.

Reloo is disabled by default, but can be enabled by setting the RELOO environment variable (which in turn sets the reloo parameter of this document).

# Run reloo (exact refits for the high-Pareto-k observations), or load a
# cached copy. The Sys.time() calls bracket the (day-long) run time in the
# knitted output.
reloofile <- cachefile(paste0("reloo-", MODEL_CACHE, ".rds"))
if (file.exists(reloofile)) {
    (reloo <- readRDS(reloofile))
} else {
    Sys.time()
    # NOTE(review): the result shadows brms::reloo(); re-running this branch
    # in the same session would then fail. A rename would also require
    # updating the plot(reloo)/pareto_k_ids(reloo) calls further down.
    (reloo <- reloo(m, loo, chains=CHAINS, cores=CORES) )
    Sys.time()
    saveRDS(reloo, reloofile)
}
## 
## Computed from 12000 by 31007 log-likelihood matrix.
## 
##          Estimate    SE
## elpd_loo  -8492.7 163.8
## p_loo       265.0  18.9
## looic     16985.4 327.6
## ------
## MCSE of elpd_loo is 0.7.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.1, 1.6]).
## 
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.

Note that the reloo results are cached, and included in the docker image by default. If you want to reproduce the reloo manually, set a separate cache dir (and mount it into the docker image), and enable the RELOO environment variable. It will take more than one day to complete all reloo, so it is best done over a weekend.

# plotting the recalculated values
plot(reloo)

# which points have higher pareto_k than 0.5?
# loo::pareto_k_ids() is called with its default threshold (presumably 0.5,
# per the comment above -- confirm against the installed loo version); the
# empty tibble below shows no observation exceeds it after reloo.
influencers <- data[loo::pareto_k_ids(reloo),]
influencers |> group_by(repo,committerteam) |> tally()
## # A tibble: 0 × 3
## # Groups:   repo [0]
## # ℹ 3 variables: repo <fct>, committerteam <fct>, n <int>

According to the reloo function, all Pareto-k values are below 0.7, so we should be able to trust the model inferences.

Posterior predictive checks

# Posterior-predictive draws for the training data (one row per draw).
yrep <- posterior_predict(m)

Posterior proportion of zeros

# Posterior-predictive proportion of zeros vs. the observed proportion.
ppc_stat(y = d$y, yrep, stat = \(y) mean(y == 0))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of zeros is marginally higher than the observed value, but it still encompasses the observed proportion.

Posterior max value

# Posterior-predictive maximum; kept in `sim_max` for the zoomed view below.
(sim_max <- ppc_stat(y = d$y, yrep, stat = "max")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

sim_max + scale_x_continuous(limits = c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of predicted max values, after training, is close to the observed max value for this model, and very similar to \(\mathcal{M}_4\).

Posterior standard deviation

# Posterior-predictive standard deviation vs. the observed value.
ppc_stat(y = d$y, yrep, stat = "sd")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Comparing the x-axis scale, we see that this model is expecting slightly lower standard deviation (range goes to 30 instead of 40, as for \(\mathcal{M}_4\)).

Posterior Q99

# Posterior-predictive 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Posterior predicted Q99")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Posterior Q95 vs Q99

# Joint (2-d) posterior-predictive distribution of Q95 and Q99.
ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + ggtitle("Posterior predicted Q95 vs Q99")

Identical to \(\mathcal{M}_4\).

Posterior grouped predictions

# Posterior-predictive maximum per team.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$team) + ggtitle("Posterior predictive max observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compared to \(\mathcal{M}_4\), this model is more likely to predict lower number of duplicates. So the team predicted max values will, for the most part, be lower than for \(\mathcal{M}_4\).

# Posterior-predictive maximum per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$repo) + ggtitle("Posterior predictive max observation per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Posterior max per repository have some variation, in particular in Saturn and Jupiter.

# Posterior-predictive 99th percentile per team.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + ggtitle("Posterior predictive 99% quartile per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The 99th percentile value predictions seem very well fitted. Predictions surround the observation nicely.

# Posterior-predictive 99th percentile per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + ggtitle("Posterior predictive 99% quartile per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Rootogram

# Suspended rootogram: deviations of predicted from observed counts per value.
(rootogram <- pp_check(m, type = "rootogram", style="suspended")
)
## Using all posterior draws for ppc type 'rootogram' by default.

We need to size the rootogram to see anything of value.

# Restrict the x-axis to counts 0-50, where nearly all of the mass is.
rootogram + scale_x_continuous(limits=c(0,50)) + ggtitle("Suspended rootogram, scaled to 0-50 prediction interval")
## Warning: Removed 5151 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 5149 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

Posterior predictions

The posterior predictions in general work well, though the single outlier in Saturn is not picked up (as most other changes there yield very few duplicates). Our priors do allow the outliers to shine through, but relegate them as very unlikely (most max predictions are in the 10s or 20s).

# Helpers: posterior_predict_by_team_and_repo() and heatmap_by_team_and_repo().
source("predict_utils.R")
# Probability of introducing at least one duplicate (share of draws > 0), per
# team/repo, at Q95 added/removed lines and mean complexity/duplicates.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates = mean(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Same probability at Q99 complexity/duplicates.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = q99(data$COMPLEX), duplicates = q99(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Same probability at median complexity/duplicates.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = median(data$COMPLEX), duplicates = median(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Same probability at Q99 added/removed and low (Q25) complexity/duplicates.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity = quantile(data$COMPLEX, 0.25), duplicates = quantile(data$DUP, 0.25), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Same probability at helper-default complexity/duplicates.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Predicted duplicate counts summarised by upper quantiles / central tendency.
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q95(x)), "Quantile 95%", decimals=0)

heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q99(x)), "Quantile 99%", decimals=0)

heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity=q95(data$COMPLEX), duplicates = q95(data$DUP), summary=function(x) q99(x)), "Quantile 99%", decimals=0)

heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity=q99(data$COMPLEX), duplicates = q99(data$DUP), summary=function(x) quantile(x, 0.75)), "Quantile 75%", decimals=0)

heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=quantile(data$ADD, 0.99), removed=quantile(data$DEL, 0.1), complexity=mean(data$COMPLEX), duplicates = mean(data$DUP), summary=function(x) median(x)), "Median a posteriori", decimals=0)

The various heatmaps all look similar to the findings from \(\mathcal{M}_4\).

# Full posterior summary: group-level sds/correlations, population-level
# effects, and the shape parameter.
summary(m)
##  Family: zero_inflated_negbinomial 
##   Links: mu = log; shape = identity; zi = logit 
## Formula: y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo) 
##          zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
##    Data: d (Number of observations: 31007) 
##   Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
##          total post-warmup draws = 12000
## 
## Group-Level Effects: 
## ~team (Number of levels: 11) 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)              0.26      0.11     0.06     0.49 1.00     4307
## sd(R)                      0.17      0.08     0.04     0.36 1.00     5000
## sd(C)                      0.21      0.09     0.05     0.41 1.00     5415
## sd(D)                      0.11      0.07     0.02     0.27 1.00     6907
## sd(zi_Intercept)           0.15      0.08     0.03     0.34 1.00     6569
## sd(zi_R)                   0.26      0.11     0.06     0.49 1.00     4110
## sd(zi_C)                   0.17      0.09     0.03     0.39 1.00     6185
## sd(zi_D)                   0.16      0.09     0.03     0.35 1.00     6614
## cor(Intercept,R)           0.14      0.36    -0.59     0.77 1.00     7315
## cor(Intercept,C)           0.06      0.36    -0.64     0.71 1.00     8828
## cor(R,C)                   0.11      0.36    -0.62     0.75 1.00     6983
## cor(Intercept,D)           0.14      0.38    -0.62     0.79 1.00    10758
## cor(R,D)                  -0.02      0.38    -0.71     0.69 1.00    10901
## cor(C,D)                   0.01      0.38    -0.70     0.71 1.00     9948
## cor(zi_Intercept,zi_R)     0.00      0.38    -0.70     0.71 1.00     5113
## cor(zi_Intercept,zi_C)    -0.11      0.38    -0.76     0.65 1.00    10045
## cor(zi_R,zi_C)             0.09      0.38    -0.65     0.76 1.00     9622
## cor(zi_Intercept,zi_D)    -0.00      0.38    -0.71     0.71 1.00    10231
## cor(zi_R,zi_D)            -0.14      0.38    -0.79     0.62 1.00    10098
## cor(zi_C,zi_D)            -0.08      0.38    -0.76     0.66 1.00     9542
##                        Tail_ESS
## sd(Intercept)              5394
## sd(R)                      6091
## sd(C)                      5124
## sd(D)                      7278
## sd(zi_Intercept)           6996
## sd(zi_R)                   4407
## sd(zi_C)                   6619
## sd(zi_D)                   7077
## cor(Intercept,R)           8565
## cor(Intercept,C)           7845
## cor(R,C)                   7848
## cor(Intercept,D)           9931
## cor(R,D)                   9530
## cor(C,D)                   9799
## cor(zi_Intercept,zi_R)     7809
## cor(zi_Intercept,zi_C)     8980
## cor(zi_R,zi_C)             9225
## cor(zi_Intercept,zi_D)     8277
## cor(zi_R,zi_D)             9137
## cor(zi_C,zi_D)            10056
## 
## ~team:repo (Number of levels: 84) 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)              0.47      0.07     0.33     0.62 1.00     4705
## sd(R)                      0.32      0.06     0.21     0.45 1.00     4724
## sd(C)                      0.34      0.09     0.17     0.52 1.00     2131
## sd(D)                      0.21      0.05     0.12     0.33 1.00     4860
## sd(zi_Intercept)           0.59      0.08     0.44     0.75 1.00     6019
## sd(zi_R)                   0.35      0.08     0.21     0.52 1.00     3296
## sd(zi_C)                   0.41      0.11     0.19     0.64 1.00     2260
## sd(zi_D)                   0.37      0.08     0.22     0.54 1.00     4877
## cor(Intercept,R)          -0.23      0.21    -0.62     0.19 1.00     4068
## cor(Intercept,C)          -0.15      0.23    -0.56     0.33 1.00     4881
## cor(R,C)                   0.25      0.24    -0.23     0.70 1.00     3738
## cor(Intercept,D)           0.01      0.25    -0.48     0.49 1.00     5685
## cor(R,D)                  -0.32      0.26    -0.76     0.25 1.00     5783
## cor(C,D)                  -0.28      0.29    -0.79     0.31 1.00     3871
## cor(zi_Intercept,zi_R)    -0.35      0.20    -0.73     0.07 1.00     4605
## cor(zi_Intercept,zi_C)    -0.47      0.20    -0.81    -0.02 1.00     4811
## cor(zi_R,zi_C)             0.17      0.29    -0.40     0.72 1.00     3166
## cor(zi_Intercept,zi_D)     0.07      0.22    -0.38     0.49 1.00     5450
## cor(zi_R,zi_D)             0.15      0.26    -0.36     0.65 1.00     4529
## cor(zi_C,zi_D)             0.01      0.30    -0.53     0.60 1.00     2627
##                        Tail_ESS
## sd(Intercept)              7535
## sd(R)                      7094
## sd(C)                      5005
## sd(D)                      6323
## sd(zi_Intercept)           8211
## sd(zi_R)                   4887
## sd(zi_C)                   4044
## sd(zi_D)                   7478
## cor(Intercept,R)           6362
## cor(Intercept,C)           7490
## cor(R,C)                   6218
## cor(Intercept,D)           7932
## cor(R,D)                   7973
## cor(C,D)                   7155
## cor(zi_Intercept,zi_R)     6227
## cor(zi_Intercept,zi_C)     7206
## cor(zi_R,zi_C)             4899
## cor(zi_Intercept,zi_D)     7782
## cor(zi_R,zi_D)             6594
## cor(zi_C,zi_D)             5085
## 
## Population-Level Effects: 
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       -1.51      0.15    -1.80    -1.21 1.00     6115     7022
## zi_Intercept     1.90      0.16     1.59     2.20 1.00     5561     7741
## A                1.07      0.05     0.98     1.16 1.00     7191     8158
## R                0.09      0.09    -0.09     0.28 1.00     6542     8155
## C                0.30      0.12     0.05     0.53 1.00     5223     6334
## D                0.29      0.08     0.12     0.45 1.00     6727     7953
## zi_A            -1.16      0.06    -1.27    -1.05 1.00     8284     8699
## zi_R             0.43      0.13     0.16     0.65 1.00     6211     8056
## zi_C             0.33      0.14     0.05     0.59 1.00     4878     5347
## zi_D            -0.34      0.11    -0.54    -0.13 1.00     6809     8007
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape     1.04      0.09     0.88     1.22 1.00     9149     8957
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
source("conditional_effects.R")
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter")

(red_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Jupiter") + labs(title=NULL, subtitle=NULL, legend=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() +  scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(red_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() + ylab(NULL) + scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(red_plot <- ggarrange(red_jupiter_plot, red_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Green", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Green", "Jupiter")

# Blue-team panels for Jupiter and Uranus, with wider y-limits (0-260).
# NOTE(review): theme_bw() is a *complete* theme and resets legend.position,
# so placing it after theme(legend.position = "none") silently re-enables the
# legend here. That may be accidental — but ggarrange(common.legend = T) below
# extracts the legend from this plot, so "fixing" the order would remove the
# combined figure's legend. Confirm intent before reordering.
(blue_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + theme(legend.position = "none") + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

# Uranus panel; y-axis label suppressed (shared with the Jupiter panel).
(blue_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + ylab(NULL) + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

# Combine the two Blue panels with a single shared legend. TRUE spelled out
# instead of the reassignable shorthand T.
(blue_plot <- ggarrange(blue_jupiter_plot, blue_uranus_plot, common.legend = TRUE, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

# 2x2 grid: Blue panels on top, Red panels below, with automatic A-D panel
# labels and one shared legend. TRUE spelled out instead of the reassignable
# shorthand T.
(complexity_plot <- ggarrange(plotlist = list(blue_jupiter_plot, blue_uranus_plot, red_jupiter_plot, red_uranus_plot), labels="AUTO", nrow=2, ncol=2, common.legend = TRUE, legend="bottom")
 )
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

# Same conditional effect for team Red across the remaining repos (Uranus,
# Venus, Neptune, Saturn, IntTest, Mars, Mercury), with added/removed lines
# held at q99/q95. Only the Venus plot is kept in a variable for later reuse.
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(red_venus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Venus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Venus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Neptune", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Neptune") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Saturn", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Saturn") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "IntTest", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "IntTest") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mars", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mars") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mercury", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mercury") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

The conditional effects plots all look similar to \(\mathcal{M}_4\) as well.

# Raw-data sanity checks behind the plots above: distribution of introduced
# duplicates (INTROD), commit counts, and complexity bins for selected
# team/repo combinations.
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(desc(n))
## # A tibble: 7 × 2
##   INTROD     n
##    <int> <int>
## 1      0   968
## 2      1    44
## 3      2    23
## 4      3     5
## 5      4     4
## 6      5     2
## 7     17     1
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 13 × 2
##    INTROD     n
##     <int> <int>
##  1      0  2080
##  2      1    35
##  3      2    18
##  4      3     2
##  5      4     9
##  6      5     4
##  7      6     3
##  8      7     8
##  9      8     2
## 10      9     2
## 11     19     1
## 12     22     1
## 13     23     1
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 6 × 2
##   INTROD     n
##    <int> <int>
## 1      0   214
## 2      1     5
## 3      2     7
## 4      5     2
## 5      6     1
## 6     11     2
data |> filter(committerteam=="Red", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 5 × 2
##   INTROD     n
##    <int> <int>
## 1      0   348
## 2      1    19
## 3      2    11
## 4      3     7
## 5      4     1
# Commit counts and complexity histogram (bins of width 5) for Red in Venus.
data |> filter(committerteam=="Red", repo=="Venus") |> tally()
##     n
## 1 231
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 32 × 2
##    `trunc(COMPLEX/5)`     n
##                 <dbl> <int>
##  1                  0    40
##  2                  1    18
##  3                  2    19
##  4                  3    26
##  5                  4    19
##  6                  5    16
##  7                  6     8
##  8                  7     2
##  9                  8     5
## 10                  9     7
## # ℹ 22 more rows
# Same for Red in Jupiter — roughly 10x more commits and a wider complexity range.
data |> filter(committerteam=="Red", repo=="Jupiter") |> tally()
##      n
## 1 2166
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 89 × 2
##    `trunc(COMPLEX/5)`     n
##                 <dbl> <int>
##  1                  0   413
##  2                  1   301
##  3                  2   223
##  4                  3   179
##  5                  4   131
##  6                  5   138
##  7                  6    77
##  8                  7    67
##  9                  8    62
## 10                  9    33
## # ℹ 79 more rows
data |> filter(committerteam=="Red") |> group_by(repo) |> tally() |> arrange(desc(n))
## # A tibble: 8 × 2
##   repo        n
##   <fct>   <int>
## 1 Jupiter  2166
## 2 Saturn   1191
## 3 IntTest   617
## 4 Uranus    386
## 5 Venus     231
## 6 Neptune   215
## 7 Mercury   211
## 8 Mars      130
data |> filter(committerteam=="Blue", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
##   INTROD     n
##    <int> <int>
## 1      0   654
## 2      1     8
## 3      2     4
## 4      3     1
## 5      4     1
## 6      5     1
## 7      8     1
# NOTE(review): this repeats the Blue/Uranus tally from the top of this chunk,
# only ordered by INTROD instead of by n.
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
##   INTROD     n
##    <int> <int>
## 1      0   968
## 2      1    44
## 3      2    23
## 4      3     5
## 5      4     4
## 6      5     2
## 7     17     1
# Inspect the rows with the maximum observed file complexity (two commits
# touching the same file in Jupiter).
data |> filter(COMPLEX==max(COMPLEX))
##      repo                                   commit file ISTEST ISNEW ISREMOVED
## 1 Jupiter 52efadf06bb61941887114298aa281c277e1f8a0  910  FALSE FALSE     FALSE
## 2 Jupiter 6de4ea0ed3207a898be965f3901efc5f0339bc0e  910  FALSE FALSE     FALSE
##                                     author authorteam
## 1 8ff0b36b3305550dc55c9097f3b970c599959326     Yellow
## 2 230d4e806215564296e126c9b532f1936adaaa89        Red
##                                  committer committerteam churn addedComplexity
## 1 0e33fff6868e21d94d344c4bf2fcd81285b2c3e2          Arch     3               0
## 2 230d4e806215564296e126c9b532f1936adaaa89           Red   113              10
##   removedComplexity cADD cDEL cSIZE cINTROD cREMOVED ADD DEL REASON CLOC
## 1                 0  582  879  1461       0        0   3   3      I 9443
## 2                 0  135   34   169       0        0 113  32      F 9443
##   COMPLEX DUP  logcADD  logcDEL logcSIZE logcREMOVED   logADD   logDEL
## 1    1244  25 6.368187 6.779922 7.287561           0 1.386294 1.386294
## 2    1244  25 4.912655 3.555348 5.135798           0 4.736198 3.496508
##   logCOMPLEX   logDUP INTROD REMOVED          A          R        C  scaleDUP
## 1   7.126891 3.258097      0       0 -0.4751701 -0.1458825 2.537155 0.5413155
## 2   7.126891 3.258097      0       0  1.6152839  1.2642792 2.537155 0.5413155
##          D         cA        cR        cSZ       cREM   cREMlog
## 1 2.058662 0.67129576  1.044691  0.6849683 -0.2146616 -0.455959
## 2 2.058662 0.03418981 -0.252292 -0.3619011 -0.2146616 -0.455959
# Conditional effect of added lines by file complexity for selected teams in
# the Jupiter repo.
teamBlue <- condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter")
teamArch <- condeffect_logADD_by_logCOMPLEX(m, d, "Arch", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamBlue)

# NOTE(review): as.integer(C) truncates toward zero, while the exploration just
# below uses round(C) before unscaling — presumably the same binning was
# intended in both places; verify which is correct.
teamBlue |> mutate(added=unscale_added(A), complexity=factor(trunc(unscale_complexity(as.integer(C))))) |> select(complexity) |> summary()
##  complexity
##  14 :6000  
##  489:4000
# Spot-check: unscale a random sample of the (standardised) complexity values.
d |> sample_n(10) |> select(y, A, C, team, repo) |> mutate(foo = trunc(unscale_complexity(round(C))))
##    y          A           C   team    repo foo
## 1  0 -0.2221461  0.55867729  Brown  Saturn  85
## 2  0  0.5291739 -1.16528154   Blue  Saturn   2
## 3  0 -0.9077175 -0.76639503  Green Jupiter   2
## 4  0  0.2603507 -1.56416806    Red  Saturn  -1
## 5  0 -1.3402650  1.15630936   Pink  Saturn  85
## 6  0 -0.1259509  1.85875713  Green IntTest 489
## 7  0  1.3109406 -0.76639503  Green    Mars   2
## 8  0  0.5886506 -0.04546552 Yellow  Uranus  14
## 9  0  1.6472927  0.97177370   Arch IntTest  85
## 10 0 -0.6546935  0.41199404 Yellow IntTest  14
  #mutate(truncC=factor(trunc(unscale_complexity(round(C, 0)))), added=unscale_added(A), complexity=factor(trunc(unscale_complexity(round(C, 0))))) |> select(complexity) 
plot_logADD_by_logCOMPLEX(m, d, teamArch)

# Same effect for teams that rarely commit to these repos.
teamBrown <- condeffect_logADD_by_logCOMPLEX(m, d, "Brown", "IntTest")
plot_logADD_by_logCOMPLEX(m, d, teamBrown)

teamUI <- condeffect_logADD_by_logCOMPLEX(m, d, "UI", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamUI)

# Posterior distributions of the population-level (fixed-effect) coefficients.
(p <- mcmc_areas(m, regex_pars = c("^b_")) + theme_bw() + ggtitle("Population-level beta parameter distributions"))

Not surprisingly, the population-level betas are similar to \(\mathcal{M}_4\).

# Posterior distributions of the group-level standard deviations, the
# team-level intercepts, and the Red team's random effects (team level and
# team:repo level for IntTest).
mcmc_areas(m, regex_pars = c("^sd_"))

mcmc_areas(m, regex_pars = c("^r_team[[].*,Intercept"))

mcmc_areas(m, regex_pars = c("^r_team.*[[]Red,.*[]]"))

mcmc_areas(m, regex_pars = c("^r_team:repo.*[[].*Red_IntTest,.*[]]"))

# Battery of conditional-effect plots: duplicates vs. complexity (conditioned
# on added lines) and vs. added lines (conditioned on complexity/duplicates)
# for various team/repo combinations. The x axis uses a log1p transform with
# hand-picked breaks throughout.
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "IntTest", robust=T)) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Neptune")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Uranus")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "Jupiter")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

# Added lines vs. estimated new duplicates, with removed lines and existing
# duplicates fixed at hand-picked values (200 / 20) or at sample quantiles.
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Neptune", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

# Publication-style variants for Jupiter (Green/Blue/Red) with cleaned-up
# labels and covariates at the q95 of the whole data set.
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,215))
)

(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)

(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))+ scale_y_continuous(limits=c(0,175))

# Added lines vs. duplicates, conditioned on removed lines and complexity.
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Jupiter", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

# Repo-specific quantiles for Jupiter, used to condition the next few plots.
jupiter_stats <- data |> filter(repo == "Jupiter") |> summarize(remQ95 = q95(DEL), compQ95 = q95(COMPLEX), remQ99 = q99(DEL), compQ99 = q99(COMPLEX))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Jupiter", removed=jupiter_stats$remQ95, complexity = jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "IntTest", removed=jupiter_stats$remQ95, complexity=jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + scale_y_continuous(limits = c(0,400))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "IntTest", removed=q95(data$DEL), complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "NewTeam", "Saturn")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Saturn", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Saturn", removed=q95(data$DUP), duplicates = q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

# Venus-repo effects for teams that rarely or never commit there (removed and
# duplicates fixed at 0), and Jupiter-repo effects for Brown/Newbies at q95.
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Orange", "Venus", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Yellow", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

All the conditional effects plots are virtually the same for this model and \(\mathcal{M}_4\), which is natural given that the posterior population-level predictors have very similar distributions.

source("predict_utils.R")

Q95 change in a highly complex file

# Cumulative probability of introduced duplicates for five teams in four
# repos (including the unseen "Aniara"), with all covariates at their q95.
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Jupiter", "IntTest", "Aniara", "Venus"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.25,1.0)) + scale_x_continuous(limits = c(0,30)) )
## Warning: Removed 4714 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

#' Boxplot of P(INTROD > 0) per repository for one team.
#'
#' @param predictions Data frame from predict_for_team()/onepred(); must
#'   contain the columns team, repo, pred0 (percent scale) and the scenario
#'   columns added, removed, complexity, duplicates.
#' @param aTeam Name of the team to plot.
#' @return A ggplot object (returned invisibly by the caller's print).
duplicates_probability_for_team <- function(predictions, aTeam) {
  # All rows must come from a single prediction scenario, otherwise the
  # subtitle (and the plot) would mix incomparable covariate settings.
  params <- predictions |> select(added, removed, complexity, duplicates) |> distinct()
  stopifnot(length(params$added) == 1)
  predictions |> filter(team == aTeam) |>
    ggplot(aes(x=repo, y=pred0/100, color=team)) +  # pred0 is in percent; rescale to [0,1]
    geom_boxplot() +
    scale_color_manual(values=COLOR_BY_TEAM) +
    theme_bw() +
    ggtitle(paste("Probability of any duplicate per repository for team", aTeam), 
            paste("added", round(params$added, 0), "removed", round(params$removed, 0), "complexity", round(params$complexity, 0), "duplicates", round(params$duplicates, 0))) + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    ylab("P(INTROD > 0)") +
    scale_y_continuous(limits=c(0,1))
}

#' Boxplot of P(INTROD > 5) per repository for one team.
#'
#' @param predictions Data frame from predict_for_team()/onepred(); must
#'   contain the columns team, repo, pred5 (percent scale) and the scenario
#'   columns added, removed, complexity, duplicates.
#' @param aTeam Name of the team to plot.
#' @return A ggplot object.
more_five_probability_for_team <- function(predictions, aTeam) {
  # Guard against mixing several prediction scenarios in one plot.
  params <- predictions |> select(added, removed, complexity, duplicates) |> distinct()
  stopifnot(length(params$added) == 1)
  predictions |> filter(team == aTeam) |>
    ggplot(aes(x=repo, y=pred5/100, color=team)) +  # pred5 is in percent; rescale to [0,1]
    geom_boxplot() +
    scale_color_manual(values=COLOR_BY_TEAM) +
    theme_bw() +
    ggtitle(paste("Probability of more than five duplicates per repository for team", aTeam), 
            paste("added", round(params$added, 0), "removed", round(params$removed, 0), "complexity", round(params$complexity, 0), "duplicates", round(params$duplicates,0))) + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    ylab("P(INTROD > 5)") +
    scale_y_continuous(limits=c(0,1))
}
# One prediction scenario at the q95 of all four covariates, visualised for
# team Blue with the two helper functions defined above.
pred <- onepred(added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))
(p <- duplicates_probability_for_team(pred, "Blue"))

(p <- more_five_probability_for_team(pred, "Blue"))

Observed number of introduced duplicates.

# Observed (not predicted) duplicates in the Mars repo.
# NOTE(review): MARS and GREEN are bare names here, unlike the quoted
# "Mars"/"Green" used elsewhere — presumably variables defined in an earlier
# chunk; verify they exist.
# NOTE(review): geom_point() + geom_jitter() draws every observation twice
# (once unjittered, once jittered) — confirm both layers are intended.
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> ggplot(aes(x=DUP, y=COMPLEX, size=INTROD, color=INTROD)) + geom_point() + geom_jitter() + scale_y_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100, 250, 500, 750), trans="log1p") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_color_distiller(palette = "Spectral") + theme_bw()
)

# Maximum observed INTROD per team in the Mars repo.
data |> filter(repo == MARS) |> group_by(committerteam) |> filter(INTROD == max(INTROD)) |> select(committerteam, INTROD, DUP) |> arrange(desc(INTROD))
## # A tibble: 28 × 3
## # Groups:   committerteam [10]
##    committerteam INTROD   DUP
##    <fct>          <int> <int>
##  1 Brown             24    86
##  2 Blue              14    72
##  3 Green             10     0
##  4 Violet             5     0
##  5 Yellow             2     1
##  6 Yellow             2     1
##  7 Red                2     0
##  8 Yellow             2     0
##  9 Orange             1     1
## 10 Orange             1     1
## # ℹ 18 more rows
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> group_by(INTROD) |> ggplot(aes(x=DUP, group=INTROD, fill=INTROD)) + geom_bar(position="stack") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_fill_distiller(palette = "Spectral") + theme_bw()
)

# Cumulative probability of duplicates for the same teams in the observed
# repos (covariates at q95).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest", "Mars"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.50,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 3072 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

Letting the teams work in new repo

# Hypothetical scenario: selected teams committing to every repo, including
# the unseen "Aniara" repo.
allrepos <- c("Aniara", "Jupiter", "IntTest", "Mercury", "Venus", "Mars", "Saturn", "Neptune", "Uranus")

(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Brown"), repo=allrepos, added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.20,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 10264 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

# Same, but with duplicates/complexity at their medians instead of q95.
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=median(data$DUP), complex=median(data$COMPLEX))) + scale_y_continuous(limits = c(0.70,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=20, complex=170)) + scale_y_continuous(limits = c(0.35,1)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 1176 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

# Observed q95/q99 covariate values, overall and per repo, for reference when
# reading the scenario plots above.
data |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
##   q95(ADD) q95(DEL) q95(COMPLEX) q95(DUP)
## 1      143       92          282       36
data |> group_by(repo) |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## # A tibble: 8 × 5
##   repo    `q95(ADD)` `q95(DEL)` `q95(COMPLEX)` `q95(DUP)`
##   <fct>        <dbl>      <dbl>          <dbl>      <dbl>
## 1 IntTest       216       127.           558          110
## 2 Jupiter       120        74.4          177           19
## 3 Mars          146.      114            154.           5
## 4 Mercury       144       106            282            6
## 5 Neptune       231       156            180            9
## 6 Saturn        101.       72.6          320            9
## 7 Uranus        158.       95.9           89.9          4
## 8 Venus         154        84.5          132            4
data |> group_by(repo) |> summarize(q99(ADD), q99(DEL), q99(COMPLEX), q99(DUP))
## # A tibble: 8 × 5
##   repo    `q99(ADD)` `q99(DEL)` `q99(COMPLEX)` `q99(DUP)`
##   <fct>        <dbl>      <dbl>          <dbl>      <dbl>
## 1 IntTest       638.       511.           633       620. 
## 2 Jupiter       298.       246.          1056        32  
## 3 Mars          316.       557.           661        72  
## 4 Mercury       304.       283.           374.       16.9
## 5 Neptune       511        474            216        28  
## 6 Saturn        239        209.           758        18  
## 7 Uranus        291.       273.           126.        9  
## 8 Venus         433        374.           260        11

It is strange that the teams in the Neptune repo seem very similar for this change. The Neptune repo is mostly static, with no team owning it — mostly abandoned code.

# Further cumulative-probability scenarios for various team/repo subsets and
# covariate settings (q95, mean, median, q99).
# NOTE(review): some calls below pass complexity= while earlier calls to
# predict_for_team() use complex= — if the formal argument is named "complex",
# a "complexity" argument would not match it (partial matching only works for
# prefixes) and could be silently swallowed by ...; verify in predict_utils.R.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "Venus"), added=q95(data$ADD), removed=q95(data$DEL))) + scale_y_continuous(limits = c(0.6,1)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 344 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "IntTest"), added=mean(data$ADD), removed=mean(data$DEL))) + scale_y_continuous(limits = c(0.2,1.02)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates=mean(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 327 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.3,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2081 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2427 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 6215 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q99(data$ADD), removed=q99(data$DEL), complexity = median(data$COMPLEX), duplicates=median(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 2585 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).